# Required packages
required_packages <- c("dplyr", "janitor", "knitr", "readxl","tidyr", "forcats", "broom", "ggplot2", "MASS", "haven", "reshape2", "stringr", "lubridate")

# Attach every package named in `packages`, installing any that are missing.
#
# packages: character vector of package names.
# Returns NULL invisibly (the value of the `for` loop); the function is called
# for its side effect of attaching the packages to the search path.
check_and_install <- function(packages) {
  for (pkg in packages) {
    # requireNamespace() only probes availability: unlike require() it does not
    # attach the package and, with quietly = TRUE, does not warn when missing.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    # library() raises an error if the package is still unavailable, so a
    # failed installation stops the script here instead of further down.
    library(pkg, character.only = TRUE)
  }
}

check_and_install(required_packages)

# Load the village-to-Gram-Panchayat (GP) LGD code lookup.
LGD_GP <- read.csv("All India Village to GP LGD codes.csv")

# Blank GP codes are treated as missing.
LGD_GP$Gram.Panchayat.LGD.Code[LGD_GP$Gram.Panchayat.LGD.Code == "" | is.na(LGD_GP$Gram.Panchayat.LGD.Code)] <- NA

# Rows with a blank or missing GP name get an explicit placeholder label, so
# they can later be told apart from rows that have no LGD match at all.
LGD_GP$Gram.Panchayat.Name[LGD_GP$Gram.Panchayat.Name == "" | is.na(LGD_GP$Gram.Panchayat.Name)] <- "Place Present in LGD Dataset But Empty Local Body Code Field"

# Zero-pad the Census 2011 village code to six characters so it can be joined
# against the six-character key built from the SHRUG identifier.
# NOTE(review): values that as.numeric() cannot parse become NA before
# formatting -- confirm the source column contains only numeric codes.
LGD_GP$Village.Census.2011.Code <- sprintf("%06d", as.numeric(LGD_GP$Village.Census.2011.Code))

# (The two cleaning statements above were previously repeated verbatim at this
# point; the repeat was a no-op and has been removed.)

length(LGD_GP$Village.Census.2011.Code)

summary(LGD_GP)

# One row per village code (the first occurrence wins), keeping only the
# columns needed for the merge.
LGD_GP_compact <- LGD_GP[!duplicated(LGD_GP$Village.Census.2011.Code),
                   c("Village.Census.2011.Code", "Gram.Panchayat.LGD.Code", "Gram.Panchayat.Name")]


############


# ---- Load the SHRUG shrid location-names file ----
shrug <- read.csv("shrid_loc_names.csv")

# Characters 17 to 22 of shrid2 serve as the join key against the LGD
# village code.
shrug$pc11_shrid_id <- substr(shrug$shrid2, start = 17, stop = 22)

# ---- Left-join the GP lookup onto SHRUG ----
# all.x = TRUE keeps every SHRUG row, including those with no LGD match.
shrug_LGD <- merge(
  x = shrug,
  y = LGD_GP_compact,
  by.x = "pc11_shrid_id",
  by.y = "Village.Census.2011.Code",
  all.x = TRUE
)

# After the left join, a GP name that is NA, empty or a single space is
# relabelled with the "missing from LGD" placeholder.
shrug_LGD$Gram.Panchayat.Name[
  is.na(shrug_LGD$Gram.Panchayat.Name) |
    shrug_LGD$Gram.Panchayat.Name %in% c("", " ")
] <- "Present in Shrug but Missing from LGD dataset"

# Rows that matched an LGD record whose GP name was blank in the LGD file.
no_name <- shrug_LGD[
  which(shrug_LGD$Gram.Panchayat.Name == "Place Present in LGD Dataset But Empty Local Body Code Field"),
  ,
  drop = FALSE
]


# TRUE for every row whose GP LGD code appears on more than one row of
# shrug_LGD.
gp_multiple <- local({
  gp_code <- shrug_LGD$Gram.Panchayat.LGD.Code
  gp_code %in% gp_code[duplicated(gp_code)]
})

# Flag per row:
#   NA    - no GP LGD code
#   "No"  - GP name is one of the two placeholder labels, or the code is unique
#   "Yes" - the GP code is shared with at least one other row
shrug_LGD$shrid_part_of_multi_village_GP <- local({
  no_gp_code <- is.na(shrug_LGD$Gram.Panchayat.LGD.Code)
  placeholder_name <- shrug_LGD$Gram.Panchayat.Name %in% c(
    "Present in Shrug but Missing from LGD dataset",
    "Place Present in LGD Dataset But Empty Local Body Code Field"
  )
  ifelse(
    no_gp_code,
    NA,
    ifelse(placeholder_name | !gp_multiple, "No", "Yes")
  )
})


table(shrug_LGD$shrid_part_of_multi_village_GP)


## ---- Urban local bodies ----

LGD_Urban <- read.csv("All India Villages by Urban Status and Urban Body.csv")

table(LGD_Urban$Village_Rural_or_Urban)

# Keep only the records whose status is "Urban" or "Both".
LGD_Urban <- LGD_Urban[
  which(LGD_Urban$Village_Rural_or_Urban %in% c("Urban", "Both")),
  ,
  drop = FALSE
]

# One row per village code (first occurrence wins), then only the columns
# needed for the merge.
LGD_Urban_compact <- LGD_Urban[!duplicated(LGD_Urban$Village_Code_2011), ]
LGD_Urban_compact <- LGD_Urban_compact[
  ,
  c("Village_Code_2011", "Local_Body_Code", "Local_Body_Name", "Local_Body_Type_Name")
]

## ---- Left-join the urban bodies onto the SHRUG/GP table ----
shrug_LGD <- merge(
  x = shrug_LGD,
  y = LGD_Urban_compact,
  by.x = "pc11_shrid_id",
  by.y = "Village_Code_2011",
  all.x = TRUE
)

summary(shrug_LGD)

# ---- Rows with a blank urban body type ----
# Kept for inspection: rows whose urban body type is the empty string.
empty <- subset(shrug_LGD, Local_Body_Type_Name == "")

# Row positions whose urban body type is empty or a single space.
empty_rows <- which(shrug_LGD$Local_Body_Type_Name == "" | shrug_LGD$Local_Body_Type_Name == " ")

# Treat these rows as having no urban body at all. The name is cleared together
# with the type and the code: previously it was left untouched, so any non-NA
# leftover name (e.g. "") took precedence over the GP name when
# local_body_name is derived further down, while LGD_code and local_body_type
# fell back to the GP values.
# NOTE(review): this changes local_body_name for these rows -- confirm that
# falling back to the GP name is the intended outcome.
shrug_LGD$Local_Body_Type_Name[empty_rows] <- NA
shrug_LGD$Local_Body_Code[empty_rows] <- NA
shrug_LGD$Local_Body_Name[empty_rows] <- NA

# ---- States listed as having no Panchayati Raj ----
# For these states both the urban body type and name are overwritten with a
# marker label; Local_Body_Code is left as it is.
shrug_LGD$Local_Body_Type_Name[shrug_LGD$state_name %in% c("nagaland", "meghalaya", "mizoram", "nct of delhi", "chandigarh")] <-
  "State Has No Panchayati Raj"


shrug_LGD$Local_Body_Name[shrug_LGD$state_name %in% c("nagaland", "meghalaya", "mizoram", "nct of delhi", "chandigarh")] <-
  "State Has No Panchayati Raj"

# Rows carrying both a GP LGD code and an urban local body code.
both_codes_subset <- shrug_LGD[
  which(!is.na(shrug_LGD$Gram.Panchayat.LGD.Code) & !is.na(shrug_LGD$Local_Body_Code)),
  ,
  drop = FALSE
]

# Their count is simply the number of rows in that subset.
both_codes_present <- nrow(both_codes_subset)

# Report the count.
cat("Number of observations with both GP LGD code and Local Body Code:", both_codes_present, "\n")


# Derive the unified local-body columns. The work happens on a copy inside
# local() so the helper masks do not linger in the workspace; columns are added
# in the same order as before.
shrug_LGD <- local({
  out <- shrug_LGD

  has_urban_code <- !is.na(out$Local_Body_Code)
  has_gp_code <- !is.na(out$Gram.Panchayat.LGD.Code)
  # Both a GP code and an urban body code are present.
  converted <- has_gp_code & has_urban_code
  gp_name_is_placeholder <- out$Gram.Panchayat.Name %in% c(
    "Present in Shrug but Missing from LGD dataset",
    "Place Present in LGD Dataset But Empty Local Body Code Field"
  )

  # Urban body values take precedence; GP values are the fallback.
  out$LGD_code <- ifelse(
    has_urban_code,
    out$Local_Body_Code,
    out$Gram.Panchayat.LGD.Code
  )

  out$local_body_name <- ifelse(
    !is.na(out$Local_Body_Name),
    out$Local_Body_Name,
    out$Gram.Panchayat.Name
  )

  # Type used when no urban body type exists: the placeholder label itself,
  # otherwise "Gram Panchayat" (NA if the GP name is NA).
  gp_side_type <- ifelse(
    gp_name_is_placeholder,
    out$Gram.Panchayat.Name,
    ifelse(!is.na(out$Gram.Panchayat.Name), "Gram Panchayat", NA)
  )
  out$local_body_type <- ifelse(
    !is.na(out$Local_Body_Type_Name),
    out$Local_Body_Type_Name,
    gp_side_type
  )

  # "Yes" when both codes exist; NA when there is no urban code and the GP
  # name is a placeholder; "No" in every other case.
  out$gp_to_urban_conversion <- ifelse(
    converted,
    "Yes",
    ifelse(!has_urban_code & gp_name_is_placeholder, NA, "No")
  )

  # The GP code and name are retained only for rows that have both codes.
  out$old_gp_lgd_code <- ifelse(converted, out$Gram.Panchayat.LGD.Code, NA)
  out$old_gp_name <- ifelse(converted, out$Gram.Panchayat.Name, NA)

  out
})

table(shrug_LGD$gp_to_urban_conversion)

summary(shrug_LGD)


# Drop the raw GP and urban-body columns, which are superseded by LGD_code,
# local_body_name, local_body_type and the old_gp_* columns.
# Columns are removed by name rather than by position: the earlier positional
# drop -c(9, 10, 12, 13, 14) silently removes different columns whenever the
# SHRUG file gains, loses or reorders a column.
# NOTE(review): given the merge order, those positions map to exactly these
# five columns when shrid_loc_names.csv has seven columns -- confirm.
shrug_LGD <- shrug_LGD[
  ,
  setdiff(
    names(shrug_LGD),
    c(
      "Gram.Panchayat.LGD.Code",
      "Gram.Panchayat.Name",
      "Local_Body_Code",
      "Local_Body_Name",
      "Local_Body_Type_Name"
    )
  ),
  drop = FALSE
]

# Move the multi-village flag to the last column.
shrug_LGD <- shrug_LGD[, c(setdiff(names(shrug_LGD), "shrid_part_of_multi_village_GP"), "shrid_part_of_multi_village_GP")]

table(shrug_LGD$local_body_type)
# Rows carrying the "no Panchayati Raj" marker, kept for inspection.
no_pr <- subset(shrug_LGD, local_body_type == "State Has No Panchayati Raj")


# Write the matched table to disk.
write.csv(shrug_LGD, "shrug_LGD_matched.csv", row.names = FALSE)

summary(shrug_LGD)

## ---- Match-quality report by state ----

# Leave out rows whose local body name is the "State Has No Panchayati Raj"
# marker.
valid_shrug_LGD <- shrug_LGD[
  which(shrug_LGD$local_body_name != "State Has No Panchayati Raj"),
  ,
  drop = FALSE
]

# Split the remaining rows by match status.
missing_lgd <- valid_shrug_LGD[
  which(valid_shrug_LGD$local_body_type == "Present in Shrug but Missing from LGD dataset"),
  ,
  drop = FALSE
]
empty_field <- valid_shrug_LGD[
  which(valid_shrug_LGD$local_body_type == "Place Present in LGD Dataset But Empty Local Body Code Field"),
  ,
  drop = FALSE
]
# Matched = neither of the two placeholder statuses.
matched_lgd <- valid_shrug_LGD[
  which(
    valid_shrug_LGD$local_body_type != "Present in Shrug but Missing from LGD dataset" &
      valid_shrug_LGD$local_body_type != "Place Present in LGD Dataset But Empty Local Body Code Field"
  ),
  ,
  drop = FALSE
]

# Per-state counts; fixing the factor levels to states_all keeps every count
# vector aligned, with zeros for states absent from a subset.
states_all <- sort(unique(valid_shrug_LGD$state_name))

total_counts <- table(valid_shrug_LGD$state_name)[states_all]
missing_counts <- table(factor(missing_lgd$state_name, levels = states_all))
empty_fieldcounts <- table(factor(empty_field$state_name, levels = states_all))
matched_counts <- table(factor(matched_lgd$state_name, levels = states_all))

# Per-state percentages, rounded to two decimals.
percent_table <- local({
  pct_of_total <- function(counts) {
    round(100 * as.numeric(counts) / as.numeric(total_counts), 2)
  }
  data.frame(
    State_Name = states_all,
    Percent_Missing = pct_of_total(missing_counts),
    Percent_Empty_Field = pct_of_total(empty_fieldcounts),
    Percent_LGD_Matched = pct_of_total(matched_counts)
  )
})

# Overall totals and rates across all states.
total_valid <- sum(total_counts)
total_missing <- sum(missing_counts)
total_empty_field <- sum(empty_fieldcounts)
total_matched <- sum(matched_counts)

overall_missing_rate <- round(100 * total_missing / total_valid, 2)
overall_empty_field_rate <- round(100 * total_empty_field / total_valid, 2)
overall_matched_rate <- round(100 * total_matched / total_valid, 2)

# Print the report.
print(percent_table)
cat("Overall Present in Shrug but Missing from LGD Dataset Rate:", overall_missing_rate, "%\n")
cat("Overall Present in LGD Dataset but Empty LGD Field:", overall_empty_field_rate, "%\n")
cat("Overall Shrid to LGD Matched Rate:", overall_matched_rate, "%\n")




